1 STRATAA microbiome analysis

1.1 Imports

library(rbiom)
library(kableExtra)
library(ggplot2)
library(ggpubr)
library(ggforce)
library(patchwork)
library(vegan)
library(dplyr)
library(forcats)

1.2 Sources

The file handles are set in config.R as they’re used by both this script and data_cleaning.

source("/Users/flashton/Dropbox/GordonGroup/STRATAA_Microbiome/from_Leo/Leonardos_analysis/bin/core_functions.R")
source("/Users/flashton/Dropbox/GordonGroup/STRATAA_Microbiome/from_Leo/Leonardos_analysis/bin/config.R")

1.3 Metadata basics

First do the plots with kruskall wallis comparison between groups to see if there’s an overall difference, then do the pairwise tests because KW says there is a difference.

metadata <- read_metadata(metadata_handle)

metadata <- metadata %>% mutate(Group = if_else(Group == 'Control_HealthySerosurvey', 'Healthy Control', Group))
metadata <- metadata %>% mutate(Group = if_else(Group == 'Acute_Typhi', 'Acute typhoid', Group))
number_per_country <- metadata %>% group_by(Country) %>% summarise(count = n())

number_per_country %>% kbl() %>% kable_styling()
Country count
Bangladesh 120
Malawi 114
Nepal 76
print(sum(number_per_country$count))
## [1] 310
metadata %>% group_by(Group) %>% summarise(count = n()) %>% kbl() %>% kable_styling()
Group count
Acute typhoid 103
Carrier 110
Healthy Control 97
baseline_chars <- get_baseline_characteristics(metadata)

baseline_chars$pct_anti
## [1] 37.50000  0.00000  0.00000 26.47059  0.00000  0.00000 44.82759  0.00000
## [9]  0.00000
baseline_chars$pct_anti %>% na_if(0)
## [1] 37.50000       NA       NA 26.47059       NA       NA 44.82759       NA
## [9]       NA
# BEWARE - na_if(0.0) will convert ALL 0s to NAs, this is ok as they're only in the antibiotics of carriers and controls
# at the moment, but if others get added in, will screw with it.
# baseline_chars %>% rename(`Median age` = median_age, `Women (%)` = pct_fem, `Antibiotics in last 2 weeks (%)` = pct_anti) %>% pivot_longer(!c(Country, Group)) %>% rename(variable_name = name) %>% pivot_wider(names_from = Group, values_from = value) %>% filter(variable_name != 'number') %>% na_if(0.0) %>% kbl(digits = c(NA, NA, 1, 1, 1)) %>% kable_styling()

# na_if stopped working, so had to do this instead
baseline_chars_table <- baseline_chars %>% rename(`Median age` = median_age, `Women (%)` = pct_fem, `Antibiotics in last 2 weeks (%)` = pct_anti) %>% pivot_longer(!c(Country, Group)) %>% rename(variable_name = name) %>% pivot_wider(names_from = Group, values_from = value) %>% filter(variable_name != 'number') 

baseline_chars_table$Carrier <- replace(baseline_chars_table$Carrier,which(baseline_chars_table$Carrier==0),NA)
baseline_chars_table$`Healthy Control` <- replace(baseline_chars_table$`Healthy Control`,which(baseline_chars_table$`Healthy Control`==0),NA)
baseline_chars_table %>% kbl(digits = c(NA, NA, 1, 1, 1)) %>% kable_styling()
Country variable_name Acute typhoid Carrier Healthy Control
Bangladesh Median age 6.0 37.0 28.5
Bangladesh Women (%) 47.5 47.5 65.0
Bangladesh Antibiotics in last 2 weeks (%) 37.5 NA NA
Malawi Median age 9.7 32.0 24.0
Malawi Women (%) 64.7 57.5 65.0
Malawi Antibiotics in last 2 weeks (%) 26.5 NA NA
Nepal Median age 17.2 43.9 35.0
Nepal Women (%) 24.1 66.7 82.4
Nepal Antibiotics in last 2 weeks (%) 44.8 NA NA
ggplot(metadata, aes(x = Country, y = Age, fill = Group)) + geom_boxplot() + stat_compare_means(method = 'kruskal.test', label = "p")

comparisons_groups <- list(c("Acute typhoid", "Carrier"), c("Acute typhoid", "Healthy Control"), c("Carrier", "Healthy Control"))
comparisons_countries <- list(c('Bangladesh', 'Malawi'), c('Bangladesh', 'Nepal'), c('Malawi', 'Nepal'))
# default stat method when doing pairwise tests in wilcoxon.
ggboxplot(metadata, facet.by = "Country", y = "Age", x = "Group", color = "Group") + stat_compare_means(comparisons = comparisons_groups) + rremove("x.text") + rremove("xlab") + rremove("x.ticks") # +rotate_x_text(angle = 45)

ggboxplot(metadata, facet.by = "Group", y = "Age", x = "Country", color = "Country") + stat_compare_means(comparisons = comparisons_countries) + rotate_x_text(angle = 45)

country_group_sex <- metadata %>% group_by(Country, Group, Sex) %>% summarise(count = n()) 

plot_sex <- function(eg1, c){
  d <- eg1 %>% filter(Country == c)
  p <- ggplot(d, aes(x = Group, y = count, fill = Sex)) + 
    geom_bar(stat ='identity', position = 'fill') + 
    ylab('Proportion') + 
    ggtitle(c)# +
    #theme(axis.text=element_text(size=34), axis.title=element_text(size=36,face="bold"), plot.title = element_text(size = 40, face = "bold"), legend.key.size = unit(4, 'cm'), legend.title = element_text(size = 34), legend.text = element_text(size = 28))
  
  return(p)
}

m_sex <- plot_sex(country_group_sex, 'Malawi')
b_sex <- plot_sex(country_group_sex, 'Bangladesh')
n_sex <- plot_sex(country_group_sex, 'Nepal')
m_sex / b_sex / n_sex 

1.4 read in metaphlan data

strataa_metaphlan_data <- read.csv(file = file.path(metaphlan_input_folder, '2023.05.11.all_strataa_metaphlan.csv'), header= TRUE, sep = ",", row.names = 1, stringsAsFactors = FALSE, check.names=FALSE)
strataa_metaphlan_data$lowest_taxonomic_level <- sapply(str_split(row.names(strataa_metaphlan_data), "\\|"), function(x) x[length(x)])

strataa_metaphlan_metadata <- read.csv(file = file.path(metaphlan_input_folder, '2023.05.11.strataa_metadata.metaphlan.csv'), header = TRUE, sep = ",", row.names = 1, stringsAsFactors = FALSE)
strataa_metaphlan_metadata <- strataa_metaphlan_metadata %>% mutate(SampleID = rownames(strataa_metaphlan_metadata))
strataa_metaphlan_metadata <- strataa_metaphlan_metadata %>% mutate(age_bracket=cut(Age, breaks=c(0, 1, 5, 15, Inf), labels=c("<1", "1-5", "6-15", ">15")))

1.5 Alpha diversity - metaphlan

Alpha diversity - all countries, all groups

all_countries_all_groups_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Bangladesh', 'Malawi', 'Nepal'), groups_of_interest = c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Carrier'), c('Acute_Typhi', 'Control_HealthySerosurvey'), c('Control_HealthySerosurvey', 'Carrier')))

all_countries_all_groups_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
rownames(alpha_anova_summary[[1]]) Df Sum.Sq Mean.Sq F.value Pr..F. is_it_significant
Country:Group 4 7.83 1.96 12.4 3.09e-09 significant
Group 2 5.92 2.96 18.7 2.55e-08 significant
Country 2 2.75 1.38 8.7 0.00022 significant
Country:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 2 1.83 0.913 5.77 0.00352 significant
Country:Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 2 1.25 0.625 3.95 0.0204 not_significant
Sex:Group 2 1.13 0.566 3.58 0.0292 not_significant
Country:Sex 2 1.11 0.553 3.49 0.0318 not_significant
Country:Sex:Age 2 1.03 0.517 3.27 0.0396 not_significant
Country:Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.835 0.417 2.64 0.0733 not_significant
Country:Age 2 0.576 0.288 1.82 0.164 not_significant
Country:Group:Age 4 1.03 0.256 1.62 0.169 not_significant
Sex:Age 1 0.298 0.298 1.88 0.171 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.474 0.237 1.5 0.226 not_significant
Country:Sex:Group 4 0.817 0.204 1.29 0.274 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.318 0.159 1.01 0.367 not_significant
Group:Age 2 0.239 0.119 0.754 0.471 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.0529 0.0529 0.334 0.564 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.0476 0.0476 0.301 0.584 not_significant
Sex 1 0.0294 0.0294 0.186 0.667 not_significant
Sex:Group:Age 2 0.051 0.0255 0.161 0.851 not_significant
Country:Sex:Group:Age 4 0.127 0.0317 0.201 0.938 not_significant
Country:Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.000136 0.000136 0.000861 0.977 not_significant
Age 1 1.07e-08 1.07e-08 6.75e-08 1 not_significant
Residuals 261 41.3 0.158 NA NA NA
all_countries_all_groups_alpha$alpha_plot_group

all_countries_all_groups_alpha$alpha_plot_antibiotics

Alpha diversity - all countries, healthy and acute

all_countries_healthy_acute_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Bangladesh', 'Malawi', 'Nepal'), groups_of_interest = c('Acute_Typhi', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Control_HealthySerosurvey')))

all_countries_healthy_acute_alpha$alpha_by_country

all_countries_healthy_acute_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
rownames(alpha_anova_summary[[1]]) Df Sum.Sq Mean.Sq F.value Pr..F. is_it_significant
Country 2 5.74 2.87 17.4 1.36e-07 significant
Country:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 2 1.68 0.842 5.12 0.00699 significant
Sex:Age 1 1.07 1.07 6.5 0.0117 not_significant
Country:Sex 2 1.5 0.75 4.56 0.0119 not_significant
Country:Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 2 1.39 0.693 4.21 0.0165 not_significant
Country:Age 2 1.02 0.51 3.1 0.0477 not_significant
Sex:Group 1 0.628 0.628 3.82 0.0525 not_significant
Country:Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.816 0.408 2.48 0.0871 not_significant
Country:Group 2 0.698 0.349 2.12 0.123 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.618 0.309 1.88 0.157 not_significant
Group 1 0.164 0.164 0.997 0.319 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 2 0.359 0.179 1.09 0.339 not_significant
Sex 1 0.0592 0.0592 0.36 0.55 not_significant
Country:Sex:Age 2 0.188 0.0938 0.57 0.567 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.0511 0.0511 0.31 0.578 not_significant
Sex:Group:Age 1 0.0368 0.0368 0.224 0.637 not_significant
Group:Age 1 0.0147 0.0147 0.0891 0.766 not_significant
Country:Group:Age 2 0.0713 0.0357 0.217 0.805 not_significant
Age 1 0.00772 0.00772 0.0469 0.829 not_significant
Country:Sex:Group 2 0.0595 0.0297 0.181 0.835 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.00675 0.00675 0.041 0.84 not_significant
Country:Sex:Group:Age 2 0.05 0.025 0.152 0.859 not_significant
Country:Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.000136 0.000136 0.000828 0.977 not_significant
Residuals 163 26.8 0.165 NA NA NA
all_countries_healthy_acute_alpha$alpha_plot_group

all_countries_healthy_acute_alpha$alpha_plot_antibiotics

alpha diversity, malawi only

malawi_healthy_acute_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Malawi'), groups_of_interest = c('Acute_Typhi', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Control_HealthySerosurvey')))

malawi_healthy_acute_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
rownames(alpha_anova_summary[[1]]) Df Sum.Sq Mean.Sq F.value Pr..F. is_it_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 1 2.27 2.27 16 0.000167 significant
Antibiotics_taken_before_sampling_yes_no_assumptions 1 1.31 1.31 9.27 0.0034 significant
Sex 1 1.16 1.16 8.21 0.00567 significant
Group 1 0.84 0.84 5.93 0.0177 not_significant
Age 1 0.451 0.451 3.19 0.079 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 1 0.406 0.406 2.87 0.0952 not_significant
Sex:Group 1 0.313 0.313 2.21 0.142 not_significant
Sex:Group:Age 1 0.0978 0.0978 0.691 0.409 not_significant
Sex:Age 1 0.0164 0.0164 0.116 0.734 not_significant
Group:Age 1 0.00122 0.00122 0.00864 0.926 not_significant
Residuals 63 8.91 0.141 NA NA NA
malawi_healthy_acute_alpha$alpha_plot_group

malawi_healthy_acute_alpha$alpha_plot_antibiotics

1.6 Beta diversity - metaphlan

All groups.

all_countries_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh', 'Malawi', 'Nepal'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
all_countries_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0759082 0.0009990 significant
Group:Age 0.0121511 0.0059940 significant
Sex:Group:Age 0.0081783 0.1028971 not_significant
Sex:Group 0.0079033 0.1058941 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0076947 0.1358641 not_significant
Age 0.0038070 0.2057942 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0035541 0.2257742 not_significant
Sex:Age 0.0035425 0.2407592 not_significant
Sex 0.0035891 0.2417582 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0027831 0.4825175 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0045854 0.7812188 not_significant
Total 1.0000000 NA NA
Residual 0.8663032 NA NA
bgd_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
bgd_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.2679584 0.0009990 significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0124851 0.0369630 not_significant
Sex:Age 0.0093987 0.1308691 not_significant
Age 0.0084979 0.1678322 not_significant
Sex:Group 0.0155821 0.2037962 not_significant
Sex 0.0078067 0.2157842 not_significant
Group:Age 0.0115137 0.5164835 not_significant
Sex:Group:Age 0.0104905 0.6363636 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0047215 0.6893107 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0044103 0.6933067 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0035306 0.8511489 not_significant
Total 1.0000000 NA NA
Residual 0.6436043 NA NA
# bgd_beta_all_groups$pc12
# bgd_beta_all_groups$pc34

mal_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
mal_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.1942771 0.0009990 significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0247209 0.0009990 significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0225358 0.0009990 significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0169857 0.0069930 significant
Sex:Age 0.0138711 0.0149850 not_significant
Sex:Group 0.0217097 0.0229770 not_significant
Group:Age 0.0223640 0.0289710 not_significant
Age 0.0114622 0.0459540 not_significant
Sex 0.0075054 0.2767233 not_significant
Sex:Group:Age 0.0129653 0.4505495 not_significant
Total 1.0000000 NA NA
Residual 0.6516027 NA NA
# mal_beta_all_groups$pc12
# mal_beta_all_groups$pc34

nep_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
nep_beta_all_groups$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0898195 0.0009990 significant
Sex 0.0176933 0.1438561 not_significant
Sex:Age 0.0148693 0.2607393 not_significant
Age 0.0125478 0.4615385 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0124906 0.4785215 not_significant
Sex:Group 0.0239383 0.5744256 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0228450 0.6333666 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0106433 0.6473526 not_significant
Sex:Group:Age 0.0169339 0.9190809 not_significant
Group:Age 0.0162360 0.9620380 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0102948 0.9980020 not_significant
Total 1.0000000 NA NA
Residual 0.7516884 NA NA
all_countries_beta_all_groups$pc12 / bgd_beta_all_groups$pc12 / nep_beta_all_groups$pc12

Acute vs healthy.

all_countries_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh', 'Malawi', 'Nepal'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
all_countries_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0285252 0.0009990 significant
Group:Age 0.0117687 0.0149850 not_significant
Age 0.0092014 0.0369630 not_significant
Sex:Group:Age 0.0072077 0.1048951 not_significant
Sex 0.0070879 0.1108891 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0117438 0.2027972 not_significant
Sex:Age 0.0058394 0.2297702 not_significant
Sex:Group 0.0057923 0.2567433 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0055130 0.2977023 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0043096 0.4825175 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0070480 0.8291708 not_significant
Total 1.0000000 NA NA
Residual 0.8959630 NA NA
bgd_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
bgd_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0782718 0.0009990 significant
Sex 0.0230592 0.0399600 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0199564 0.0799201 not_significant
Sex:Age 0.0168558 0.1288711 not_significant
Age 0.0146689 0.2217782 not_significant
Group:Age 0.0109377 0.5044955 not_significant
Sex:Group:Age 0.0100500 0.5704296 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0080933 0.7512488 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0074906 0.8341658 not_significant
Sex:Group 0.0067483 0.8841159 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0059228 0.9240759 not_significant
Total 1.0000000 NA NA
Residual 0.7979452 NA NA
mal_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
mal_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.1995900 0.0009990 significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0371210 0.0009990 significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0381931 0.0029970 significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0267524 0.0069930 significant
Sex:Group 0.0200496 0.0269730 not_significant
Age 0.0194908 0.0309690 not_significant
Sex:Age 0.0144797 0.1348651 not_significant
Sex 0.0141068 0.1448551 not_significant
Sex:Group:Age 0.0091754 0.4585415 not_significant
Group:Age 0.0078947 0.5954046 not_significant
Total 1.0000000 NA NA
Residual 0.6131466 NA NA
nep_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
nep_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
R2 Pr(>F) is_it_significant
Group 0.0598990 0.0039960 significant
Sex 0.0484279 0.0089910 significant
Sex:Group 0.0246755 0.2917083 not_significant
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0244159 0.3356643 not_significant
Antibiotics_taken_before_sampling_yes_no_assumptions 0.0450942 0.4025974 not_significant
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0207951 0.5044955 not_significant
Sex:Group:Age 0.0175717 0.6853147 not_significant
Age 0.0154038 0.8171828 not_significant
Sex:Age 0.0148604 0.8351648 not_significant
Group:Age 0.0102689 0.9800200 not_significant
Age:Antibiotics_taken_before_sampling_yes_no_assumptions 0.0188359 0.9960040 not_significant
Total 1.0000000 NA NA
Residual 0.6997518 NA NA
all_countries_beta_acute_healthy$pc12 + bgd_beta_acute_healthy$pc12 + mal_beta_acute_healthy$pc12 + nep_beta_acute_healthy$pc12

1.7 Plot phyla bar plots

# get the taxa that are phyla, not classes, or below (species etc), and tidy the data.
phyla <- strataa_metaphlan_data %>% mutate(clade_name = rownames(strataa_metaphlan_data)) %>% filter(grepl("p__", clade_name)) %>% filter(!grepl("c__", clade_name)) %>% pivot_longer(!c(clade_name, lowest_taxonomic_level), names_to = "sample", values_to = "relative_abundance")

# relative_abundance > 1 returns a list of TRUE/FALSE values, which is then summed to get the number of samples in which the phylum is present at > 1% relative abundance.
# then we filter to only keep phyla that are present at 1% in at least 10% of samples.
phyla_to_exclude <- phyla %>% group_by(clade_name) %>% 
    summarise(count = sum(relative_abundance > 1)) %>% 
    filter(count < (length(unique(phyla$sample)) / 10)) %>% 
    pull(clade_name)
View(phyla_to_exclude)

# in order to make each sample add up to 100, we need to add the excluded taxa back in as a single "rare taxa" phylum.
# first we need to calculate the relative abundance of the excluded taxa in each sample.
excluded_phyla <- phyla %>%
  filter(clade_name %in% phyla_to_exclude) %>% group_by(sample) %>% summarise(relative_abundance = sum(relative_abundance))
# then make a column with the name "rare_taxa" for each sample, annd bind it to the excluded taxa data.
rare_taxa_column <- data.frame(lowest_taxonomic_level = c(rep("rare_taxa", nrow(excluded_phyla))), clade_name = c(rep("rare_taxa", nrow(excluded_phyla))))
excluded_phyla <- cbind(rare_taxa_column, excluded_phyla)

# then remove the excluded taxa from the phyla data.
phyla_clean <- phyla %>%
  filter(!(clade_name %in% phyla_to_exclude))
# and add the excluded taxa back in.
phyla_clean <- rbind(phyla_clean, excluded_phyla)
View(phyla_clean)
View(excluded_phyla)

colnames(strataa_metaphlan_metadata)
## [1] "Group"                                               
## [2] "Sex"                                                 
## [3] "Country"                                             
## [4] "Age"                                                 
## [5] "Antibiotics_taken_before_sampling_yes_no_assumptions"
## [6] "sequencing_lane"                                     
## [7] "SampleID"                                            
## [8] "age_bracket"
metadata_select <- strataa_metaphlan_metadata %>% dplyr::select(SampleID, Group, Country)


phyla_clean_metadata <- phyla_clean %>% left_join(metadata_select, by = c("sample" = "SampleID"))
phyla_clean_metadata <- phyla_clean_metadata %>% mutate(Group = ifelse(Group == "Acute_Typhi", "Typhi", Group)) %>% mutate(Group = ifelse(Group == "Control_HealthySerosurvey", "Healthy", Group))

order_of_groups <- c("Typhi", "Healthy")

plot_per_country_abundance <- function(phyla_clean_metadata, country, group_order){
    # thanks chatgpt!
    phyla_clean_country <- phyla_clean_metadata %>% filter(Country == country) %>% filter(Group %in% group_order)

    phyla_clean_country_fct <- phyla_clean_country %>%
    mutate(Group = factor(Group, levels = group_order),  # Convert group to a factor with the desired order
            group_order_numeric = as.numeric(Group),  # Create a new numeric variable based on the order of group
            sample = fct_reorder(sample, group_order_numeric))  # Reorder sample based on group_order_numeric

    p <- ggplot(data = phyla_clean_country_fct, aes(x = sample, y = relative_abundance, fill = clade_name)) + 
        geom_bar(stat = "identity") + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1), axis.title.x = element_blank()) + 
        scale_x_discrete(breaks=phyla_clean_country_fct$sample, labels=phyla_clean_country_fct$Group) + 
        ggtitle(country) +
        ylim(0, 100.01)
    p
}

bangladesh_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Bangladesh", group_order = order_of_groups)
# bangladesh_phyla_plot
malawi_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Malawi", group_order = order_of_groups)
nepal_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Nepal", group_order = order_of_groups)

bangladesh_phyla_plot / malawi_phyla_plot / nepal_phyla_plot

# going back to phyla so that get all the weird rare ones.
# need to join to meta-data
# remove carriers
# group by clade, group, country
# summarise to get the median.
# pivot wider to get the table.
# dplyr::select(order(colnames(.))) rearranges the columns in alphabetical order
# relocate moves the clade_name column to the first column.
phyla %>% left_join(metadata_select, by = c("sample" = "SampleID")) %>% filter(Group != 'Carrier') %>% group_by(clade_name, Group, Country) %>% summarise(median_prevalence = median(relative_abundance)) %>% pivot_wider(names_from = c('Country', 'Group'), values_from =  'median_prevalence') %>% dplyr::select(order(colnames(.))) %>% relocate(clade_name, .before = 1) %>% kbl() %>% kable_styling()
clade_name Bangladesh_Acute_Typhi Bangladesh_Control_HealthySerosurvey Malawi_Acute_Typhi Malawi_Control_HealthySerosurvey Nepal_Acute_Typhi Nepal_Control_HealthySerosurvey
k__Archaea|p__Candidatus_Thermoplasmatota 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Archaea|p__Euryarchaeota 0.000000 0.000000 0.249010 0.000000 0.00000 0.22474
k__Archaea|p__Thaumarchaeota 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Acidobacteria 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Actinobacteria 14.361185 13.136440 1.893725 0.524965 3.33634 3.81550
k__Bacteria|p__Bacteria_unclassified 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Bacteroidetes 0.911970 2.216730 15.361165 34.630185 28.24679 20.53695
k__Bacteria|p__Candidatus_Melainabacteria 0.000000 0.000000 0.038085 0.052210 0.00000 0.02157
k__Bacteria|p__Candidatus_Saccharibacteria 0.017310 0.015625 0.000000 0.000000 0.00000 0.00203
k__Bacteria|p__Chloroflexi 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Elusimicrobia 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Firmicutes 80.670075 72.161565 60.161350 58.311770 53.74090 60.17250
k__Bacteria|p__Fusobacteria 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Lentisphaerae 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Proteobacteria 0.629715 1.272295 6.606440 6.195335 2.90174 2.52509
k__Bacteria|p__Spirochaetes 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Synergistetes 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Bacteria|p__Tenericutes 0.000000 0.000000 0.002715 0.000000 0.01965 0.01362
k__Bacteria|p__Verrucomicrobia 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Eukaryota|p__Ascomycota 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000
k__Eukaryota|p__Eukaryota_unclassified 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000

1.8 Taxa associated with participant groups

1.8.1 basics, stats and plots

Maaslin basics

groups_to_analyse <- c('Acute_Typhi', 'Control_HealthySerosurvey')
bang_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions", "sequencing_lane")
nep_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
bangladesh_taxonomic_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis)
malawi_taxonomic_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis)
nepal_taxonomic_maaslin <- read_in_maaslin('Nepal', groups_to_analyse, nep_variables_for_analysis)
bangladesh_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(bangladesh_taxonomic_maaslin)
malawi_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(malawi_taxonomic_maaslin)
nepal_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(nepal_taxonomic_maaslin)
bangladesh_maaslin_stats <- basic_maaslin_stats(bangladesh_taxonomic_maaslin_filtered, 'Bangladesh', bang_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats <- basic_maaslin_stats(malawi_taxonomic_maaslin_filtered, 'Malawi', mwi_variables_for_analysis, groups_to_analyse)
nepal_maaslin_stats <- basic_maaslin_stats(nepal_taxonomic_maaslin_filtered, 'Nepal', nep_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats$volcano_plot / bangladesh_maaslin_stats$volcano_plot / nepal_maaslin_stats$volcano_plot

nrow(malawi_maaslin_stats$maaslin_results_sig)
## [1] 296
nrow(bangladesh_maaslin_stats$maaslin_results_sig)
## [1] 23
nrow(nepal_maaslin_stats$maaslin_results_sig)
## [1] 0

There were 296 species significantly (q-val < 0.05) associated with health/disease in Malawi, in Bangladesh, and in Nepal.

Combine the taxonomic maaslins, and print out the species that are sig in both and share directions.

Because sequencing run and participant type are totally confounded for Bangladesh, need to exclude sequencing run from the final model for Bangladesh (otherwise, wipes out the signals).

between acute <-> health.

### associated at both sites

combined_results <- run_combine_maaslins(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered, bang_variables_for_analysis, mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)

combined_results$positive_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>% 
  select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
  rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
  dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
  kbl() %>% kable_styling()
feature Species Coefficient Bangladesh Standard Error Bangladesh Q-value Bangladesh Coefficient Malawi Standard Error Malawi Q-value Malawi
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Clostridiaceae.g__Clostridium.s__Clostridium_SGB6179 s__Clostridium_SGB6179 8.79 1.34 4.25e-05 3.11 0.936 0.0286
k__Bacteria.p__Bacteroidetes.c__Bacteroidia.o__Bacteroidales.f__Prevotellaceae.g__Prevotella.s__Prevotella_copri_clade_A s__Prevotella_copri_clade_A 4.54 0.978 0.00971 7.64 1.25 1.21e-05
k__Bacteria.p__Firmicutes.c__Negativicutes.o__Veillonellales.f__Veillonellaceae.g__GGB4266.s__GGB4266_SGB5809 s__GGB4266_SGB5809 4.58 1.09 0.0147 6.94 0.981 4.38e-07
k__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Pasteurellales.f__Pasteurellaceae.g__Haemophilus.s__Haemophilus_parainfluenzae s__Haemophilus_parainfluenzae 3.64 0.979 0.03 6.92 0.953 2.26e-07
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Peptostreptococcaceae.g__Romboutsia.s__Romboutsia_timonensis s__Romboutsia_timonensis 4.52 1.25 0.0386 5.07 0.903 5.91e-05
combined_results$negative_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>% 
  select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>% 
  rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>% 
  dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% 
  kbl() %>% kable_styling()
feature Species Coefficient Bangladesh Standard Error Bangladesh Q-value Bangladesh Coefficient Malawi Standard Error Malawi Q-value Malawi
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Lachnospiraceae.g__Lachnospiraceae_unclassified.s__Lachnospiraceae_bacterium s__Lachnospiraceae_bacterium -3.05 0.838 0.0366 -2.87 0.785 0.0135
nrow(combined_results$mwi_maaslin_only)
## [1] 290
nrow(combined_results$bang_maaslin_only)
## [1] 17

There were 290 species significantly (q-val < 0.05) associated with health/disease in Malawi only and 17 in Bangladesh only.

### associated at only one site

The ones associated at only one site are written out to a file, you can look at them manually there.

1.8.2 plots of species of interest

strataa_metaphlan_data_longer <- strataa_metaphlan_data %>% mutate(feature = rownames(strataa_metaphlan_data)) %>% pivot_longer(!c(feature, lowest_taxonomic_level), names_to = "SampleID", values_to = "prevalence")
strataa_metaphlan_data_longer_meta <- strataa_metaphlan_data_longer %>% left_join(strataa_metaphlan_metadata, by = c("SampleID" = "SampleID"))


pc <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Prevotella_copri_clade_A')

cs <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Clostridium_SGB6179')

SGB5809 <-run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__GGB4266_SGB5809')

hp <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Haemophilus_parainfluenzae')

rt <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Romboutsia_timonensis')

lb <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Lachnospiraceae_bacterium')

# pc + cs + SGB5809 + hp + rt + lb

1.9 P. copri clades

Print the results for all the P. copri clades

all_taxa_maaslin <- combined_maaslins$all_features
p_copri_maaslin <- all_taxa_maaslin %>% filter(grepl('_Prevotella_copri_', feature)) %>% filter(qval_bang < 0.05 | qval_mal < 0.05)
write_csv(p_copri_maaslin, file.path(maaslin_taxonomic_output_root_folder, 'combined_maaslins_p_copri.csv'))

1.10 maaslin functional groups

Functional groups associated with health

# This needs to be replaced by combine_maaslins

variables_for_analysis <- c("Group", "Gender", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
vars_for_dirname <- paste(variables_for_analysis, collapse = '.')
groups_to_analyse <- c('Acute_Typhi', 'Control_Healthy')

bang_maaslin_output_dir <- file.path(maaslin_functional_output_root_folder, paste('Bangladesh', paste(groups_to_analyse, collapse = '_vs_'), vars_for_dirname, sep = '_'))

malawi_maaslin_output_dir <- file.path(maaslin_functional_output_root_folder, paste('Malawi', paste(groups_to_analyse, collapse = '_vs_'), vars_for_dirname, sep = '_'))

bang_maaslin <- read_delim(file.path(bang_maaslin_output_dir, "all_results.tsv"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)
malawi_maaslin <- read_delim(file.path(malawi_maaslin_output_dir, "all_results.tsv"), delim = "\t", escape_double = FALSE, trim_ws = TRUE)

combined_maaslins <- combine_maaslins(bang_maaslin, malawi_maaslin)

combined_maaslins_positive_coef <- combined_maaslins$positive_coef
combined_maaslins_negative_coef <- combined_maaslins$negative_coef

# hacky as hell thing to split the feature name to extract the organism and the enzyme class.
# because separate isn't working with .. or \\.\\. which is the actual separator used.
combined_maaslins_positive_coef <- combined_maaslins_positive_coef %>% separate_wider_delim(feature, delim = 'Entryname.', names_sep = '', cols_remove = FALSE) %>% separate_wider_delim(feature2, delim = '..OS.', names_sep = '') %>% separate_wider_delim(feature22, delim = '..SMASH', names_sep = '') %>% select(!c(feature1, feature222)) %>% rename(MGC_class = feature21, Species = feature221, feature = featurefeature)
combined_maaslins_negative_coef <- combined_maaslins_negative_coef %>% separate_wider_delim(feature, delim = 'Entryname.', names_sep = '', cols_remove = FALSE) %>% separate_wider_delim(feature2, delim = '..OS.', names_sep = '') %>% separate_wider_delim(feature22, delim = '..SMASH', names_sep = '') %>% select(!c(feature1, feature222)) %>% rename(MGC_class = feature21, Species = feature221, feature = featurefeature)

combined_maaslins_positive_coef %>%  kbl() %>% kable_styling()
combined_maaslins_negative_coef %>%  kbl() %>% kable_styling()

write_csv(combined_maaslins_positive_coef, file.path(maaslin_functional_output_root_folder, 'combined_maaslins_positive_coef.csv'))
combined_maaslins_negative_coef %>%  kbl() %>% kable_styling()
write_csv(combined_maaslins_negative_coef, file.path(maaslin_functional_output_root_folder, 'combined_maaslins_negative_coef.csv'))
groups_to_analyse <- c('Carrier', 'Control_HealthySerosurvey')
mwi_variables_for_analysis <- c("Group", "Sex", "Age")
bang_variables_for_analysis <- c("Group", "Sex", "Age")
run_combine_maaslins(groups_to_analyse, mwi_variables_for_analysis, bang_variables_for_analysis, maaslin_functional_output_root_folder, 'bigmap')